Explore how spillover and compensation affect correlations and phenograph clusters in a cyTOF dataset. This will reproduce, among other plots, the following figures from the paper “Compensation of signal spillover in suspension and imaging mass cytometry”: * Figure 3, A-E * Figure S4, A-C
A 36-antibody panel was used to detect the main immune cell populations in PBMCs (Paper Table S1). This panel was not optimized to avoid spillover effects and contained identical antibodies in different mass channels to facilitate the identification of spillover artifacts. The spillover artifacts will be investigated by using a compensation matrix estimated from single-stained bead experiments acquired on the same day as the sample.
options(knitr.root.dir = normalizePath('../'))
library(flowCore)
library(Rtsne)
library(Rphenograph)
library(CATALYST)
library(dplyr)
library(data.table)
library(dtplyr)
library(RColorBrewer)
library(gplots)
#library(Scale)
#optimal leaf ordering of hierarchical clustering:
library(cba)
library(Hmisc)
library(mclust)
flowFrame2dt <- function (datFCS){
  # Convert a flowCore flowFrame to a data.table, renaming channels that
  # carry a description (antibody label) from their raw channel name to
  # that description.
  #
  # datFCS: a flowCore flowFrame.
  # Returns: a data.table with one row per cell and one column per channel.
  dt = flowCore::exprs(datFCS)
  # make the matrix column names syntactically valid and unique
  colnames(dt) = make.names(make.unique(colnames(dt)))
  # channels with a non-NA description get renamed below
  fil = !is.na(datFCS@parameters@data$desc)
  # NOTE: a former line `colnames(dt[, fil]) <- ...` was removed here: the
  # complex assignment writes the subset's *values* back into dt but drops
  # the modified dimnames, so it had no effect. setnames() below performs
  # the actual renaming.
  dt <- data.table::data.table(dt)
  # sanitise names/descriptions exactly like the colnames() call above so
  # that the old names match the current column names
  uninames = make.names(make.unique(datFCS@parameters@data$name))
  uni_newnames = make.names(make.unique(datFCS@parameters@data$desc))
  data.table::setnames(dt, uninames[fil], uni_newnames[fil])
  return(dt)
}
censor_dat <- function (x, quant = 0.999){
  # Censor (winsorize) the upper tail of a numeric vector: every value
  # above the `quant` quantile is replaced by that quantile value.
  #
  # x: numeric vector to censor.
  # quant: upper quantile used as the cap (default 0.999).
  # Returns: x with its upper tail clipped.
  upper_bound <- quantile(x, quant)
  too_large <- x > upper_bound
  x[too_large] <- upper_bound
  return(x)
}
# the FCS file used
fn_cells = '../data/Figure_2-3/Bead_PBMC_staining/Exp2/160805_Exp3_cells-only.fcs'
name_condition = 'Exp3'
# the file containing the spillover matrix generated by CATALYST
fn_sm = c('../data/Figure_2-3/Bead_PBMC_staining/Exp2/160805_Exp3_spillover_matrix.csv')
folder_out = '../data'
# TRUE/FALSE spelled out: `T` is a reassignable shorthand and unsafe in scripts
do_load = TRUE
# number of cells to subsample (NA = keep all cells)
nsamp = 20000
# the standard CyTOF arcsinh transformation with cofactor 5
transf = function(x) asinh(x/5)
# defines some channel names that should not be considered
# Channels excluded from clustering/correlation analysis: barcoding channels,
# DNA/live-dead stains, bead and instrument channels, and technical columns.
bad_channels = c( "CD3" , "CD45","SampleID","Time", "Event_length", "MCB102", "MCB104", "MCB105", "MCB106",
"MCB108","MCB110","MCB113","MCB115" ,"Beads140" , "File.Number",
"DNA193", "DNA191" , "Live194", "Live195" , "beadDist", "barcode",
"89Y" , "102Pd_BC1" , "104Pd_BC2" ,"105Pd_BC3" ,"106Pd_BC4","108Pd_BC5", "110Pd_BC6" ,
"113In_BC7", "115In_BC8", "138Ba","140Ce", "191Ir_DNA1" ,
"193Ir_DNA2","195Pt","196Pt","208Pb", "Center","Offset","Width",
"Residual" , "102Pd" , "103Rh" , "104Pd" , "105Pd" , "106Pd","108Pd",
"110Pd", "113In","115In" ,"157Gd" ,"155Gd-CD273",'File-Number'
)
# Fixed qualitative palette used to color the Phenograph clusters consistently
# across all figures.
col_list <- c("#DC050C", "#FB8072", "#1965B0", "#7BAFDE", "#882E72",
"#B17BA6", "#FF7F00", "#FDB462", "#E7298A", "#E78AC3",
"#33A02C", "#B2DF8A", "#55A1B1", "#8DD3C7", "#A6761D",
"#E6AB02", "#7570B3", "#BEAED4", "#666666", "#999999",
"#aa8282", "#d4b7b7", "#8600bf", "#ba5ce3", "#808000",
"#aeae5c", "#1e90ff", "#00bfff", "#56ff0d", "#ffff00")
# Read each spillover matrix (first CSV column holds the row names) and key
# the list by file name.
dat_sms = lapply(fn_sm, function(fn) read.csv(fn, row.names = 1))
# basename() is already vectorized, so the previous sapply() loop is unnecessary
names(dat_sms) = basename(fn_sm)
# Read the raw, uncompensated FCS file.
ff <- flowCore::read.FCS(fn_cells)
# The two lines below are warnings emitted by read.FCS about the FCS text
# segment; they are informational only.
## the text section does not end with delimiter: \\. The last keyword is dropped.
## the text section does not end with delimiter: \\. The last keyword is dropped.
set.seed(1234)
# Subsample to nsamp cells for tractable clustering/t-SNE (skipped when nsamp is NA).
if (!is.na(nsamp)){
ff = ff[sample(nrow(ff), nsamp)]
}
es <- flowCore::exprs(ff)
# Bookkeeping table for the three datasets compared below:
# raw, 'classical' compensation, and NNLS compensation.
# NOTE(review): paste() uses sep=" ", so the names contain a space before
# '_clas'/'_nnls'; only the 'raw' name is referenced by literal later on.
dats_compmeta = data.table(compensation= c('raw',paste(names(dat_sms), '_clas'), paste(names(dat_sms),'_nnls')))
dats_compmeta[, is_raw := compensation == 'raw']
ff_list = list()
# slot 1: the uncompensated frame
ff_list[[1]] = ff
# slots 2..(1+n): classically compensated frames
for (i in seq_along(dat_sms)){
tsm = dat_sms[[i]]
# rounding helps with numerical instabilities sometimes observed during saving/loading
tsm = round(tsm, digits=8)
ff_list[[i+1]] = CATALYST::compCytof(ff, tsm)
}
# slots (2+n)..(1+2n): NNLS-compensated frames
for (i in seq_along(dat_sms)){
tsm = dat_sms[[i]]
# There seem to be some numerical instabilities when loading the data (e.g. -5.596894e-33);
# rounding fixes this issue.
tsm = round(tsm, digits=8)
ff_list[[i+(length(dat_sms)+1)]] = CATALYST::compCytof(ff, tsm, method='nnls')
}
names(ff_list) <- dats_compmeta$compensation
Converting the flowFrames to tidy long-format data.tables is not strictly necessary, but a personal preference for the downstream analysis.
tidy_ff <- function(x, nm_condition){
  # Convert a flowFrame to a keyed, long-format data.table.
  #
  # x: a flowCore flowFrame.
  # nm_condition: condition label appended to every cell id.
  # Returns: data.table with columns id, channel, counts, counts_transf,
  #          keyed by id.
  x = flowFrame2dt(x)
  # seq_len() is safe for zero-row tables, unlike 1:.N
  x[, id:=paste(as.character(seq_len(.N)), nm_condition)]
  x = melt.data.table(x, id.vars = 'id',variable.name = 'channel', value.name='counts', variable.factor = FALSE)
  # apply the file-level asinh(x/5) transformation
  x[, counts_transf:= transf(counts)]
  setkey(x, id)
  return(x)
}
# Tidy every (un)compensated flowFrame into a long-format data.table.
dats= lapply(ff_list, function(x) tidy_ff(x, name_condition))
clean_channels = function(x){
  # Normalize the `channel` column in place: remove 'X' characters that
  # make.names() prepends, and replace '.' and ' ' with '-'.
  # gsub() is vectorized, so the former per-group (`by=channel`) loops and
  # the fragile as.character(.BY) coercion are unnecessary.
  x[, channel := gsub('X', '', as.character(channel))]
  x[, channel := gsub('\\.', '-', channel)]
  x[, channel := gsub(' ', '-', channel)]
  return(x)
}
dats= lapply(dats, clean_channels)
# Table of global, cell-specific information (one row per cell), e.g. condition.
datcell = dats[['raw']] %>%
dplyr::select(-c(counts, counts_transf, channel)) %>%
dplyr::filter(!duplicated(id)) %>%
mutate(condition= name_condition)
setkey(datcell, id)
# Channel metadata: raw channel name and (cleaned) description.
datmeta = data.table(name=ff@parameters$name, channel=ff@parameters$desc)
datmeta = clean_channels(datmeta)
setkey(datmeta, channel)
# Channels that enter the clustering/correlation analysis.
good_channels =unique(datmeta$channel)[!unique(datmeta$channel) %in% bad_channels]
# Strip the leading metal tag (5 characters plus '-') to get the antibody name.
datmeta[,antibody:=gsub('.....-','',channel)]
#datmeta[antibody=='CD8b',antibody:='CD8']
# Number of channels that carry the same antibody.
datmeta[,n_chan := length(unique(channel)), by=antibody]
# Extract the isotope mass digits; channels without one get a unique placeholder.
datmeta[,metal:=substr(channel, 4, 5)]
datmeta[metal =='', metal:= as.character(1:.N)]
# adapted from https://github.com/lmweber/FlowSOM-Rtsne-example/blob/master/FlowSOM_Rtsne_example.R
do_phenograph<- function(data, channels, valuevar= 'counts_transf', seed=1234, subsample=FALSE){
  # Run Phenograph clustering on the selected channels of a long-format
  # data.table.
  #
  # data: long-format data.table with columns id, channel and `valuevar`.
  # channels: channels to use as clustering features.
  # valuevar: column holding the values to cluster on.
  # seed: RNG seed for reproducibility.
  # subsample: FALSE (use all cells) or a fraction in (0, 1].
  # Returns: data.table keyed by id mapping each cell to its cluster
  #          (cells outside the subsample get NA via the final keyed join).
  pheno_dat = data.table::dcast.data.table(data[channel %in% channels, ],id~channel,value.var = valuevar)
  all_ids = pheno_dat$id
  # FALSE means "no subsampling", i.e. keep the full fraction
  if (subsample == FALSE){
    subsample=1
  }
  sampids = pheno_dat[, sample(id, floor(.N*subsample),replace = FALSE)]
  pheno_dat_samp = pheno_dat[id %in%sampids, ]
  ids =pheno_dat_samp$id
  pheno_dat_samp[, id:=NULL]
  set.seed(seed)
  rpheno_out = Rphenograph::Rphenograph(pheno_dat_samp)
  # second list element is the igraph community object
  cluster = igraph::membership(rpheno_out[[2]])
  pheno_clust = data.table::data.table(cluster)
  pheno_clust[, id:=ids]
  pheno_clust[, cluster:=factor(cluster)]
  data.table::setkey(pheno_clust, 'id')
  # re-expand to all cells so unsampled cells appear with NA cluster
  pheno_clust = pheno_clust[all_ids ,]
  return(pheno_clust)
}
# Cluster each dataset (raw / classical / NNLS) independently with Phenograph.
datspheno= lapply(dats, function(dat) do_phenograph(dat, good_channels, valuevar = 'counts_transf', seed=1234))
## Run Rphenograph starts:
## -Input data of 20000 rows and 35 columns
## -k is set to 30
## Finding nearest neighbors...DONE ~ 11.081 s
## Compute jaccard coefficient between nearest-neighbor sets...DONE ~ 5.705 s
## Build undirected graph from the weighted links...DONE ~ 3.035 s
## Run louvain clustering on the graph ...DONE ~ 5.195 s
## Run Rphenograph DONE, totally takes 25.016s.
## Return a community class
## -Modularity value: 0.8855295
## -Number of clusters: 20
## Run Rphenograph starts:
## -Input data of 20000 rows and 35 columns
## -k is set to 30
## Finding nearest neighbors...DONE ~ 12.07 s
## Compute jaccard coefficient between nearest-neighbor sets...DONE ~ 6.846 s
## Build undirected graph from the weighted links...DONE ~ 3.41 s
## Run louvain clustering on the graph ...DONE ~ 6.313 s
## Run Rphenograph DONE, totally takes 28.639s.
## Return a community class
## -Modularity value: 0.8796457
## -Number of clusters: 18
## Run Rphenograph starts:
## -Input data of 20000 rows and 35 columns
## -k is set to 30
## Finding nearest neighbors...DONE ~ 10.706 s
## Compute jaccard coefficient between nearest-neighbor sets...DONE ~ 7.997 s
## Build undirected graph from the weighted links...DONE ~ 3.068 s
## Run louvain clustering on the graph ...DONE ~ 5.913 s
## Run Rphenograph DONE, totally takes 27.684s.
## Return a community class
## -Modularity value: 0.8882629
## -Number of clusters: 22
calc_tsne <- function (input_dat, channels, value_var = "counts", channel_var = "channel",
    id_var = "id", scale = FALSE, verbose = TRUE, dims = 2, seed=1234,...){
  # Run Rtsne on a long-format data.table.
  #
  # input_dat: long-format data.table with id, channel and value columns.
  # channels: channels used as t-SNE input features.
  # scale: whether to z-scale each channel before embedding.
  # ...: forwarded to Rtsne::Rtsne().
  # Returns: the Rtsne result, with $Y converted to a keyed data.table
  #          (columns bh_V1, bh_V2, ...) plus the channels/scale settings used.
  set.seed(seed)
  ids = input_dat[!duplicated(get(id_var)), get(id_var)]
  # wide matrix: one row per cell, one column per channel
  dt = data.table::dcast(input_dat[(get(channel_var) %in%
      channels) & get(id_var) %in% ids], formula = as.formula(paste(id_var,
      "~", channel_var)), value.var = value_var)
  if (scale) {
    tsnedat = scale(dt[, channels, with = FALSE])
  }
  else {
    tsnedat = dt[, channels, with = FALSE]
  }
  tsne_out <- Rtsne::Rtsne(tsnedat, verbose = verbose, dims = dims,
      ...)
  tsne_out$Y = data.table(tsne_out$Y)
  setnames(tsne_out$Y, names(tsne_out$Y), paste("bh", names(tsne_out$Y),
      sep = "_"))
  tsne_out$Y$id = dt[, get(id_var)]
  data.table::setnames(tsne_out$Y, "id", id_var)
  data.table::setkeyv(tsne_out$Y, id_var)
  tsne_out$channels = channels
  # BUG FIX: previously hard-coded to F; record the scale setting actually used
  tsne_out$scale = scale
  return(tsne_out)
}
# Setup some colors for plotting: pool all qualitative RColorBrewer palettes
# into one long color vector.
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
The t-SNE map is calculated on the uncompensated (raw) values.
# Embed the raw (uncompensated) data; this map is reused for all conditions.
ttsne = calc_tsne(dats[['raw']], good_channels, value_var = 'counts_transf', channel_var = "channel")
## Read the 20000 x 35 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
## - point 0 of 20000
## - point 10000 of 20000
## Done in 15.68 seconds (sparsity = 0.006750)!
## Learning embedding...
## Iteration 50: error is 104.475015 (50 iterations in 61.51 seconds)
## Iteration 100: error is 101.438325 (50 iterations in 106.46 seconds)
## Iteration 150: error is 87.019123 (50 iterations in 51.82 seconds)
## Iteration 200: error is 85.054767 (50 iterations in 45.46 seconds)
## Iteration 250: error is 84.248226 (50 iterations in 49.76 seconds)
## Iteration 300: error is 3.666333 (50 iterations in 42.68 seconds)
## Iteration 350: error is 3.412136 (50 iterations in 40.81 seconds)
## Iteration 400: error is 3.238534 (50 iterations in 41.64 seconds)
## Iteration 450: error is 3.116084 (50 iterations in 34.63 seconds)
## Iteration 500: error is 3.023403 (50 iterations in 39.01 seconds)
## Iteration 550: error is 2.950851 (50 iterations in 35.33 seconds)
## Iteration 600: error is 2.892485 (50 iterations in 38.26 seconds)
## Iteration 650: error is 2.843872 (50 iterations in 39.39 seconds)
## Iteration 700: error is 2.802840 (50 iterations in 43.52 seconds)
## Iteration 750: error is 2.767604 (50 iterations in 35.85 seconds)
## Iteration 800: error is 2.737041 (50 iterations in 38.32 seconds)
## Iteration 850: error is 2.710190 (50 iterations in 34.85 seconds)
## Iteration 900: error is 2.686808 (50 iterations in 36.63 seconds)
## Iteration 950: error is 2.666254 (50 iterations in 33.68 seconds)
## Iteration 1000: error is 2.648096 (50 iterations in 39.57 seconds)
## Fitting performed in 889.18 seconds.
This corresponds to figure S4C
# Figure S4C: t-SNE of the raw data colored by Phenograph cluster.
# NOTE(review): ggplot2 is never attached via library() in this file; it
# appears to be attached as a dependency of another loaded package — confirm.
tclust = datspheno[['raw']]
tdat = dats[['raw']]
# one row per cell, joined (by the id key) to the t-SNE coordinates and clusters
p = ggplot(subset(tdat, !duplicated(id))[ttsne$Y][tclust], aes(x=bh_V1, y=bh_V2))+
geom_point(size=0.3, alpha=1, aes(color=as.factor(cluster)))+
scale_color_manual(values = col_list)+
ggtitle('Figure S4C')+
guides(color=guide_legend(override.aes=list(size=5)))+
theme(strip.background = element_blank(),
panel.background=element_rect(fill='white', colour = 'black'),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
plot.background=element_blank(),
legend.key = element_blank())
p
ggsave(filename = file.path(folder_out, 'figS4c_phenograph-on-tsne.pdf'),plot=p,width=10, height=5)
The phenograph based on the uncompensated data is used for all future plots
# The Phenograph clustering of the *uncompensated* data is used for all
# downstream per-cluster analyses, so clusters are comparable across conditions.
clusterdat = datspheno[['raw']]
In the following plots the asinh(x/5) transformed intensities were normalized between 0-1 by using the 0.99 percentile of the data. Based on these plots the Fig 3A as well as Fig S4A-B were assembled.
Note that also the classical compensation is plotted, but due to a fixed color scheme not allowing for negatives, it does not get plotted correctly.
# Per-marker t-SNE plots for each dataset (basis of Fig 3A, Fig S4A-B):
# intensities are censored at the 0.99 quantile per channel, scaled to [0, 1]
# and drawn on the shared (raw-data) t-SNE map.
ptsne = ttsne
pdats = dats
for (tdatname in names(pdats)){
tdat = pdats[[tdatname]]
# censor and scale per channel; note := modifies the table in place
tdat[, c_counts := censor_dat(counts_transf,0.99), by=channel]
tdat[, c_counts_scaled := c_counts, by=channel]
tdat[, c_counts_scaled := (c_counts_scaled/max(c_counts_scaled)), by=channel]
p = ggplot(subset(tdat, channel %in% good_channels)[ptsne$Y],
aes(x=bh_V1, y=bh_V2, color=c_counts_scaled))+
facet_wrap(~channel, scales = "free", ncol = 9)+
geom_point(alpha=0.5, size=0.3)+
scale_color_gradientn(colours=rev(brewer.pal(11, 'Spectral')), name='Counts', limits=c(0, 1))+
ggtitle(tdatname)+
theme(strip.background = element_blank(),
strip.text.x = element_text(size = 11),
axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank(),
panel.background=element_blank(),
panel.border=element_blank(),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
plot.background=element_blank())
print(p)
#ggsave(filename = file.path(folder_out, paste(tdatname, 'tsne_all_markers.png',sep='_')), plot = p,width=15, height=6, dpi = 300)
}
The chunk below defines some convenience functions.
get_summarydat <- function(ssdat, clusterdat, valuevar){
  # Calculate per-cluster summary statistics for each channel.
  #
  # ssdat: long-format data.table keyed by id (columns id, channel, values).
  # clusterdat: data.table mapping id -> cluster (joined via the id key).
  # valuevar: name of the value column to summarise.
  # Returns: one row per channel/cluster with median, mean and cell count.
  summary_dat = ssdat[clusterdat][ ,list(
    median_val = median(get(valuevar), na.rm=TRUE),
    # BUG FIX: mean_val was previously computed with median() (copy-paste error)
    mean_val = mean(get(valuevar), na.rm=TRUE),
    cell_cluster=.N
    ), by=.(channel,cluster)]
  return(summary_dat)
}
dat2mat <- function(data, formula, valuevar){
  # Cast long-format data into a wide numeric matrix suitable for heatmap.2(),
  # using the cluster id as row names.
  #
  # data: long-format data.table (must contain a `cluster` column).
  # formula: dcast formula, e.g. 'cluster ~ channel'.
  # valuevar: name of the value column to spread.
  hm_dat = dcast.data.table(data =data, formula =formula,
      value.var = valuevar)
  trownames = hm_dat$cluster
  # drop the cluster id column and convert to a matrix
  hm_dat = as.matrix(hm_dat[,-1,with=FALSE])
  row.names(hm_dat) = trownames
  return(hm_dat)
}
This reproduces Figure 3C. For each phenograph cluster the cluster median is calculated and a heatmap is generated. All heatmaps are clustered based on the uncompensated data in order to make them visually easy to compare. The color scale corresponds to the asinh(x/5) transformed counts and is kept constant across all the plots to make them directly comparable.
# Cluster-median heatmaps on the asinh(x/5) scale, one per dataset.
# The row/column dendrograms are computed once from the FIRST dataset (raw)
# and reused for all heatmaps so they are directly comparable.
firstclust = 1
pdats = dats
for (i in seq_along(pdats)){
cur_name = names(pdats)[i]
# cluster-median matrix: rows = clusters, columns = channels
hm_dat = pdats[[i]][channel %in% good_channels,] %>%
get_summarydat(clusterdat, 'counts_transf') %>%
dat2mat('cluster ~ channel','median_val')
if (firstclust){
firstclust=F
# rows: 1 - Pearson correlation distance, Ward clustering with
# optimal leaf ordering (cba::order.optimal)
tdist = as.dist(1-cor(t(hm_dat), method="pearson"))
tdist[is.na(tdist)] = 0
hr <- hclust(tdist, method="ward.D")
co_r <- order.optimal(tdist, hr$merge)
hr$merge = co_r$merge
hr$order = co_r$order
# columns: same procedure on the transposed relationship
tdist = as.dist(1-cor((hm_dat), method="pearson"))
tdist[is.na(tdist)] = 0
hc <- hclust(tdist, method="ward.D")
co_c <- order.optimal(tdist, hc$merge)
hc$merge = co_c$merge
hc$order = co_c$order
}
# diverging palette when negatives are present, otherwise the upper half only
if (any(hm_dat<0)){
cols = rev(brewer.pal(11,'RdBu'))
cmap = colorRampPalette(cols)
} else {
cols = rev(brewer.pal(11,'RdBu'))
cmap = colorRampPalette(cols[6:11])
}
heatmap.2(hm_dat,
scale ='none',
trace = "none",
col=cmap(75),
Rowv=as.dendrogram(hr),
#RowSideColors=sel_col_vector[group_levels],
#dendrogram='column',
Colv=as.dendrogram(hc),
density.info ='none',
#Colv =NA,
#keyorient=2,
xlab = 'Conditions',
#sepwidth = c(0,0),
cexRow=0.9,
cexCol=0.9,
margins=c(4,4),
main=cur_name
#ylab ='Genes',
#comments = data.frame(names = row.names(tab_Prot))
)
}
These heatmaps use arcsinh(x/5) scaled data. There is clearly some spillover vanishing upon compensation. After compensation the same antibodies stained in the different channels look much more comparable. With the ‘classical’ compensation a slight overcompensation is visible that is not visible in the NNLS compensation.
Extra figure not used; it would be interesting how these heatmaps look on a linear scale. Thus instead of arcsinh scaling, each channel is scaled by dividing it by the maximal cluster median of the untransformed data. Thus the scale is linear and comparable between the plots:
# Same heatmaps on a *linear* scale: each channel is divided by its maximal
# absolute cluster median from the first (raw) dataset, so all plots share
# one linear normalization. Dendrograms again come from the first dataset.
firstclust = 1
pdats = dats
for (i in seq_along(pdats)){
cur_name = names(pdats)[i]
hm_dat = pdats[[i]][channel %in% good_channels,] %>%
get_summarydat(clusterdat, 'counts') %>%
dat2mat('cluster ~ channel','median_val')
# per-channel normalization factors from the first dataset only
if (firstclust){
normvec = apply(hm_dat,2,function(x) max(abs(x), na.rm = T))
}
hm_dat = t(t(hm_dat)/normvec)
if (firstclust){
firstclust=F
print(1)
# rows/columns clustered via 1 - Pearson correlation with optimal leaf order
tdist = as.dist(1-cor(t(hm_dat), method="pearson"))
tdist[is.na(tdist)] = 0
hr <- hclust(tdist, method="ward.D")
co_r <- order.optimal(tdist, hr$merge)
hr$merge = co_r$merge
hr$order = co_r$order
tdist = as.dist(1-cor((hm_dat), method="pearson"))
tdist[is.na(tdist)] = 0
hc <- hclust(tdist, method="ward.D")
co_c <- order.optimal(tdist, hc$merge)
hc$merge = co_c$merge
hc$order = co_c$order
}
# diverging palette when negatives are present, otherwise the upper half only
if (any(hm_dat<0, na.rm = T)){
cols = rev(brewer.pal(11,'RdBu'))
cmap = colorRampPalette(cols)
} else {
cols = rev(brewer.pal(11,'RdBu'))
cmap = colorRampPalette(cols[6:11])
}
heatmap.2(hm_dat,
scale ='none',
trace = "none",
col=cmap(75),
Rowv=as.dendrogram(hr),
#RowSideColors=sel_col_vector[group_levels],
#dendrogram='column',
Colv=as.dendrogram(hc),
density.info ='none',
#Colv =NA,
#keyorient=2,
xlab = 'Conditions',
#sepwidth = c(0,0),
cexRow=0.9,
cexCol=0.9,
margins=c(4,4),
main=cur_name
#ylab ='Genes',
#comments = data.frame(names = row.names(tab_Prot))
)
}
## [1] 1
-> Also with a linear scale the spillover is clearly visible and also correctly compensated.
get_cormat <- function(data, xcol, ycol, valuecol, method='pearson', pval = FALSE){
  # Calculate a correlation matrix from long-format data.
  #
  # data: long-format data.table.
  # xcol/ycol: observation and variable columns for the cast (xcol ~ ycol).
  # valuecol: value column.
  # method: correlation type passed to Hmisc::rcorr ('pearson' or 'spearman').
  # pval: if FALSE return only the correlation matrix ($r); if TRUE return
  #       the full rcorr object (with $r and $P).
  cormat = dcast.data.table(data, paste(xcol, ycol,sep='~'), value.var = valuecol) %>%
    dplyr::select(-matches(xcol)) %>%
    as.matrix() %>%
    rcorr(type=method)
  if (!pval){
    cormat = cormat$r
  }
  return(cormat)
}
The following section calculates the Spearman correlation between all markers, and its significance, within the single cells of each cluster with more than 200 cells.
The correlation heatmaps correspond to Figure 3E of the paper. The heatmaps are clustered according to the correlation structure of the uncompensated data to ensure direct visual comparability. We chose Spearman correlation as it is completely independent of the transformation used; Pearson correlation gives similar results for asinh transformed counts, however. The p-value heatmaps correspond to the log10(p-value) of the corresponding correlation value.
# Single-cell marker correlations per cluster (Figure 3E and Figure 3C input):
# for every cluster with > mincells cells and every dataset, compute the
# Spearman correlation matrix and count significant / strong correlations.
# Heatmaps are drawn only for clusters in p_clusts, with the row dendrogram
# fixed from the first (raw) dataset of each cluster.
p_co = 0.005
mincells =200
p_clusts = c(12)
doprint = F
corcluster = list()
cols = rev(brewer.pal(11,'RdBu'))
cmap = colorRampPalette(cols)
pdats = dats
# clusters large enough to give stable correlation estimates
clusts = clusterdat[, list(n=.N),by=cluster][n>mincells, unique(cluster)]
for (clust in clusts){
firstclust = 1
for (i in seq_along(dats)){
cur_name = paste(c(names(dats)[i], as.character(clust)), collapse = ' ')
# cells of this cluster only; rcorr gives $r and $P
cormat =pdats[[i]][clusterdat[cluster == clust, id], ][channel %in% good_channels,] %>%
get_cormat(xcol='id', ycol='channel', valuecol='counts_transf',method = 'spearman', pval = T)
cormatp = cormat$P
cormat = cormat$r
# /2 because the matrices are symmetric; nstrong additionally removes the diagonal
corcluster[[length(corcluster)+1]] = data.table(cluster=clust,
comp= names(pdats)[i],
nsig=sum(cormatp <p_co, na.rm=T)/2,
nstrong = (sum(abs(cormat) > 0.5, na.rm=T)-nrow(cormat))/2)
if ((clust %in% p_clusts) | is.null(p_clusts)){
if (firstclust){
firstclust=F
#print(1)
# dendrogram from the first dataset (raw), reused for all heatmaps
tdist = dist(t(cormat), method="euclidean")
tdist[is.na(tdist)] = 0
hr <- hclust(tdist, method="ward.D")
co_r <- order.optimal(tdist, hr$merge)
hr$merge = co_r$merge
hr$order = co_r$order
# NOTE(review): cormatco is assigned but never used afterwards
cormatco = cormatp < p_co
}
heatmap.2(cormat,
scale ='none',
trace = "none",
col=cmap(75),
symm=T,
Rowv=as.dendrogram(hr),
main=paste('Correlation:',cur_name),
margins = c(10,10)
)
cmap = colorRampPalette(cols)
# -log10 p-values, capped at 5 for display
x=-log10(cormatp)
x[x>5] = 5
heatmap.2(x,
scale ='none',
trace = "none",
col=cmap(75),
symm=T,
Rowv=as.dendrogram(hr),
Colv=as.dendrogram(hr),
main=paste('p-values:', cur_name),
margins = c(10,10)
)
}
}
}
corcluster = rbindlist(corcluster)
Based on the data generated above, the change in number of significant single cell marker correlations per cluster is investigated. This is the basis for figure 3C.
Not shown in the paper is the ‘classical’ compensation, which sometimes leads to quite a significant amount of artificial correlations due to slight overcompensation. This is another argument for the NNLS.
# Figure 3C: fraction of significant correlations per cluster, relative to
# the raw (uncompensated) data.
# NOTE(review): `fun.y` is deprecated in ggplot2 >= 3.3 (use `fun`) — works
# with the ggplot2 version recorded in sessionInfo below.
corcluster_sel=corcluster
pdat = corcluster_sel[ ,nsig_frac := nsig/nsig[comp == 'raw'] , by=cluster]
# order cluster factor levels numerically for a stable legend
pdat[, cluster_p := factor(x=as.character(cluster), levels = as.character(sort(as.numeric(unique(cluster)))))]
pdat%>%
ggplot(aes(x=comp, y=nsig_frac, color=cluster_p, group=cluster))+
# quite ugly code to keep the cluster colors consistent
scale_color_manual(values = col_list[sort(as.numeric(as.character(unique(pdat$cluster_p))))])+
geom_point(stat='summary', fun.y=sum) +
stat_summary(fun.y=sum, geom="line")+
theme(axis.text.x = element_text(angle = 90, hjust = 1),
strip.background = element_blank(),
panel.background=element_rect(fill='white', colour = 'black'),
panel.grid.major=element_blank(),
panel.grid.minor=element_blank(),
plot.background=element_blank(),
legend.key = element_blank())+
ggtitle('Figure 3C')+
expand_limits(y=0)
#ggsave(filename = file.path(folder_out, 'correlation_loss.pdf'),width=3, height=5,useDingbats=FALSE)
Same plot as above, but just showing the raw number of significant correlations.
# Same comparison with the raw (absolute) number of significant correlations.
ggplot(corcluster, aes(x=comp, y=nsig, color=cluster, group=cluster))+
#facet_wrap(~data, ncol = 1)+
geom_point(stat='summary', fun.y=sum) +
stat_summary(fun.y=sum, geom="line")+
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
expand_limits(y=0)
The following heatmap investigates the pearson correlation of the marker medians among the clusters:
# Pearson correlation of the per-cluster marker medians, one heatmap per
# dataset; the dendrogram is fixed from the first (raw) dataset.
firstclust = 1
pdats = dats
for (i in seq_along(pdats)){
cur_name = names(pdats)[i]
cormat = pdats[[i]][channel %in% good_channels,] %>%
get_summarydat(clusterdat, 'counts') %>%
get_cormat(xcol='cluster', ycol='channel', valuecol='median_val', method='pearson')
if (firstclust){
firstclust=F
print(1)
# Ward clustering on euclidean distances with optimal leaf ordering
tdist = dist(t(cormat), method="euclidean")
tdist[is.na(tdist)] = 0
hr <- hclust(tdist, method="ward.D")
co_r <- order.optimal(tdist, hr$merge)
hr$merge = co_r$merge
hr$order = co_r$order
}
cmap = colorRampPalette(cols)
heatmap.2(cormat,
scale ='none',
trace = "none",
col=cmap(75),
symm=T,
Rowv=as.dendrogram(hr),
main=cur_name
)
}
## [1] 1
-> There is a clear change in correlation, however it is tedious to judge by eye if it makes sense. Thus the section below specifically looks into ‘within antibody’ correlations and ‘within metal’ correlations.
In the experiment we have multiple occasions of the same antibody coupled to different metal isotopes. We would expect that upon compensation these correlations increase. Further, in general, correlations between different antibodies using isotopes from the same metal should rather decrease after compensation. However, this need not be the case, as the antibodies using the same metal could still be biologically correlated.
get_within_between_groupcorr <- function(cormat, grouptab, col_name, col_group){
# Summarise the mean absolute correlation *within* each group of channels.
# cormat: a named (symmetric) correlation matrix
# grouptab: a data.table containing group labels and channel names
# col_name: column in grouptab matching the correlation matrix dimnames
# col_group: column in grouptab giving the grouping (e.g. antibody or metal)
# Returns a data.table with one row per group having >1 channel and its
# mean absolute within-group correlation.
# NOTE(review): despite the function name, no *between*-group correlation
# is computed here — confirm whether that was intended.
tcormat = cormat
## An NA correlation indicates a constant value. This is treated as equivalent
## to a correlation of 0 here, as no relationship would be detected.
tcormat[is.na(tcormat)] <- 0
# ignore the diagonal (self-correlation):
diag(tcormat) <- NA
# for each group, build a logical filter over the matrix columns
channnames = colnames(tcormat)
corgroup_dat = grouptab[, .(fil=list(channnames %in% get(col_name))), by=c(col_group)]
# keep only groups (e.g. antibodies) occurring in more than one channel
corgroup_dat = corgroup_dat[sapply(fil, function(x) sum(x, na.rm = T) >1),]
# mean absolute correlation within the group's submatrix
corgroup_dat[, ':='(within_cor=mean(abs(tcormat[fil[[1]],fil[[1]]]), na.rm = T)
),by=c(col_group)]
# delete the temporary filter variable
corgroup_dat[, fil:=NULL]
return(corgroup_dat)
}
# Define the 'Fisher transform' (arctanh) axis scale: the variance
# stabilizing transform for correlation coefficients, which spreads out
# values near |r| = 1 so differences there remain visible in the plots.
scale_atanh = scales::trans_new('arctanh', atanh, tanh, breaks =scales::pretty_breaks() )
Do all the calculations:
# For every dataset, compute cluster-median correlations and summarise them
# within identical antibodies and within identical metals.
tcorr_tabs_antibody = list()
tcorr_tabs_metal = list()
pdats = dats
for (i in seq_along(pdats)){
cur_name = names(pdats)[i]
cormat = pdats[[i]][channel %in% good_channels,] %>%
get_summarydat(clusterdat, 'counts') %>%
get_cormat(xcol='cluster', ycol='channel', valuecol='median_val')
# within antibodies (same antibody in different metal channels)
tcorr_tab = get_within_between_groupcorr(cormat = cormat, grouptab = datmeta, col_name = 'channel',col_group = 'antibody')
tcorr_tab[, dataset:=cur_name]
tcorr_tabs_antibody = append(tcorr_tabs_antibody,list(tcorr_tab) )
# within metals (different antibodies sharing a metal); absolute
# correlations are used here
tcorr_tab = get_within_between_groupcorr(cormat = abs(cormat), grouptab = datmeta, col_name = 'channel',col_group = 'metal')
tcorr_tab[, dataset:=cur_name]
tcorr_tabs_metal = append(tcorr_tabs_metal,list(tcorr_tab) )
}
corr_tab_antibody = data.table::rbindlist(tcorr_tabs_antibody)
corr_tab_metal= data.table::rbindlist(tcorr_tabs_metal)
-> This plot looks at the pearson correlation between the median counts of the Phenograph clusters before compensation (raw) and afterwards.
# Within-antibody correlation before/after compensation, drawn on the
# variance-stabilizing arctanh (Fisher) scale defined above.
ggplot(corr_tab_antibody, aes(x=dataset, y=within_cor, color=as.factor(antibody), group=antibody))+
scale_y_continuous(trans = scale_atanh, breaks = c(0.5,0.8,0.90,0.975,0.99,0.999, 0.99999))+
expand_limits(y=0.99)+
geom_point()+
geom_line()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
-> As expected and visible in the heatmaps, the correlation between same antibodies labeled with different metals increases after compensation (in particular CD20: 0.5 -> 0.9)
sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/openblas-base/libblas.so.3
## LAPACK: /usr/lib/lapack/liblapack.so.3.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] mclust_5.3 Hmisc_4.0-3 Formula_1.2-2
## [4] survival_2.41-3 lattice_0.20-35 cba_0.2-19
## [7] proxy_0.4-17 gplots_3.0.1 RColorBrewer_1.1-2
## [10] dtplyr_0.0.2 data.table_1.10.4-1 dplyr_0.7.4
## [13] CATALYST_1.1.5 Rphenograph_0.99.1 igraph_1.1.2
## [16] ggplot2_2.2.1 Rtsne_0.13 flowCore_1.42.3
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-131 bitops_1.0-6 matrixStats_0.53.0
## [4] pbkrtest_0.4-7 httr_1.3.1 rprojroot_1.2
## [7] tools_3.4.1 backports_1.1.1 R6_2.2.2
## [10] rpart_4.1-11 KernSmooth_2.23-15 lazyeval_0.2.0
## [13] BiocGenerics_0.22.1 mgcv_1.8-22 colorspace_1.3-2
## [16] nnet_7.3-12 gridExtra_2.3 compiler_3.4.1
## [19] graph_1.54.0 quantreg_5.33 Biobase_2.36.2
## [22] htmlTable_1.9 SparseM_1.77 plotly_4.7.1
## [25] sandwich_2.4-0 labeling_0.3 checkmate_1.8.4
## [28] caTools_1.17.1 scales_0.5.0 DEoptimR_1.0-8
## [31] nnls_1.4 mvtnorm_1.0-6 robustbase_0.92-7
## [34] stringr_1.2.0 digest_0.6.15 foreign_0.8-69
## [37] minqa_1.2.4 rmarkdown_1.6 base64enc_0.1-3
## [40] rrcov_1.4-3 pkgconfig_2.0.1 htmltools_0.3.6
## [43] lme4_1.1-14 plotrix_3.7 htmlwidgets_1.0
## [46] rlang_0.1.2 shiny_1.0.5 bindr_0.1
## [49] zoo_1.8-0 jsonlite_1.5 gtools_3.5.0
## [52] acepack_1.4.1 car_2.1-5 magrittr_1.5
## [55] Matrix_1.2-11 Rcpp_0.12.15 munsell_0.4.3
## [58] stringi_1.1.5 multcomp_1.4-8 yaml_2.1.16
## [61] MASS_7.3-47 plyr_1.8.4 gdata_2.18.0
## [64] parallel_3.4.1 splines_3.4.1 knitr_1.17
## [67] corpcor_1.6.9 reshape2_1.4.3 codetools_0.2-15
## [70] stats4_3.4.1 glue_1.1.1 drc_3.0-1
## [73] evaluate_0.10.1 latticeExtra_0.6-28 nloptr_1.0.4
## [76] httpuv_1.3.5 MatrixModels_0.4-1 gtable_0.2.0
## [79] RANN_2.5.1 purrr_0.2.3 tidyr_0.7.1
## [82] assertthat_0.2.0 mime_0.5 xtable_1.8-2
## [85] largeVis_0.2.1 pcaPP_1.9-72 viridisLite_0.2.0
## [88] tibble_1.3.4 bindrcpp_0.2 cluster_2.0.6
## [91] TH.data_1.0-8